library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ stringr 1.4.0
## ✓ tidyr   1.1.4     ✓ forcats 0.5.1
## ✓ readr   2.1.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
library(formatR)
library(skimr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(lm.beta) 
library(lsr)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(plot3D)
library(dotwhisker)
## Registered S3 method overwritten by 'parameters':
##   method                         from      
##   format.parameters_distribution datawizard
knitr::opts_chunk$set(
echo = TRUE, # render output of code chunks,
warning = TRUE, # do not suppress "warnings"
message = TRUE, # do not suppress "messages"
title="#",
comment = "##", # prefix for comment lines
tidy = TRUE,
tidy.opts = list(blank = FALSE, width.cutoff = 75),
fig.path = "Kim et_al_photos/", # name of folder for images
fig.align = "center" # centers any images on the page
)
#Introduction: This paper is focused on using Magnetic Resonance Imaging (MR) to determine the location of tumors, specifically of the the Estrogen-repceptor positive (ER) and Triple Negative (TN) subtypes. Triple Negative breast cancer is known to be highly aggressive and invasive, with outcomes that typically differ from those with ER subtypes. This paper utilized multiple regression analysis to determine which clincopathological predictor were significant in predicting cancer position and found that age, mammographic density, subtype, and axilary nodal status were significantly associated with the y-distances from the chest wall (both absolute and normalized). In this report I explore several of those variables. Unfortunately, some of the significant variables are not included in the raw dataset. 
# Begin working on descriptive Stats
f <- "https://raw.githubusercontent.com/mrpickett26/Individual-Data-Analysis-Replication/main/tumor%20location%20raw%20data_final.csv"
d <- read_csv(f, col_names = TRUE)
## Rows: 1102 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Size_Pathol_cm_
## dbl (13): Subtype, Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(d)
## # A tibble: 6 × 15
##      ID Size_Pathol_cm_ Subtype Qudrants2_1_UOQ_2_UIQ_3_… MR_Location_ML__1__me…
##   <dbl> <chr>             <dbl>                     <dbl>                  <dbl>
## 1     1 1.50                  1                         2                      2
## 2     2 3.00                  1                         2                      2
## 3     3 1.60                  1                         1                      2
## 4     4 1.80                  1                         5                      2
## 5     5 3.00                  1                         1                      2
## 6     6 3.50                  3                         1                      2
## # … with 10 more variables:
## #   MR_Location_AP__1__anterior_2__middle_3__posterior_ <dbl>,
## #   Y_distance <dbl>, X_normalized <dbl>, Y_normalized <dbl>,
## #   Z_normalized <dbl>, MG_grade <dbl>, X_distance <dbl>, Z_distance <dbl>,
## #   Palpability <dbl>, Histologic_full <dbl>
skim(d)  #getting some quick stats before filtering... based on raw data a lot of filtering needs to occur but useful for a couple variables
Data summary
Name d
Number of rows 1102
Number of columns 15
_______________________
Column type frequency:
character 1
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Size_Pathol_cm_ 0 1 4 6 0 67 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ID 0 1 551.50 318.26 1.0 276.25 551.5 826.75 1102.0 ▇▇▇▇▇
Subtype 0 1 1.46 0.84 1.0 1.00 1.0 1.00 3.0 ▇▁▁▁▂
Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping 0 1 2.05 1.25 1.0 1.00 2.0 3.00 5.0 ▇▅▂▁▂
MR_Location_ML__1__medial_2__central_3__lateral_ 0 1 2.20 0.77 1.0 2.00 2.0 3.00 3.0 ▅▁▇▁▇
MR_Location_AP__1__anterior_2__middle_3__posterior_ 0 1 2.37 0.64 1.0 2.00 2.0 3.00 3.0 ▂▁▇▁▇
Y_distance 0 1 2.19 1.88 0.0 0.70 1.8 3.18 10.1 ▇▃▂▁▁
X_normalized 0 1 0.05 0.18 -0.4 -0.10 0.1 0.20 0.5 ▁▅▇▇▁
Y_normalized 0 1 0.24 0.18 0.0 0.10 0.2 0.40 0.9 ▇▇▅▁▁
Z_normalized 0 1 0.06 0.13 -0.4 0.00 0.1 0.20 0.5 ▁▃▇▃▁
MG_grade 0 1 2.78 0.84 1.0 2.00 3.0 3.00 4.0 ▂▃▁▇▃
X_distance 0 1 0.45 1.51 -4.2 -0.70 0.5 1.60 5.5 ▁▅▇▅▁
Z_distance 0 1 0.97 2.05 -5.5 -0.38 1.0 2.50 7.1 ▁▃▇▅▁
Palpability 0 1 0.60 0.49 0.0 0.00 1.0 1.00 1.0 ▅▁▁▁▇
Histologic_full 0 1 2.30 0.67 1.0 2.00 2.0 3.00 3.0 ▂▁▇▁▇
knitr::opts_chunk$set(fig.path = "Kim_et_al_photos/")
# BEGIN HEAVY DESCRIPTIVE STATISTICS
## I wanted to separate data out into the two different subtypes (ER and
## Triple Negative), which is a key factor in the paper. In this section I
## focus on the Triple Negative subtype. From this filtered dataset, I
## created variables for the number (counts) of each histological grade as
## well as the percentage of each histological grade within the
## population. I wrote a function that included n within it because later
## on, I will have to filter out some of the NAs that are in the raw
## dataset, making the n value vary from variable to variable, and this
## made it easier.
Triple_neg <- filter(d, Subtype == "3")
n <- nrow(Triple_neg)
T_Hist_grade_1 <- (length(which(Triple_neg$Histologic_full == "1")))
T_hist_grade_2 <- (length(which(Triple_neg$Histologic_full == "2")))
T_hist_grade_3 <- (length(which(Triple_neg$Histologic_full == "3")))
percent <- function(x) {
    perc <- x/256
    return(perc)
}
T_hist_grade1_percent <- percent(T_Hist_grade_1)
T_hist_grade2_percent <- percent(T_hist_grade_2)
T_hist_grade3_percent <- percent(T_hist_grade_3)
## T_Hist_grade tells the number of patients whose identified cancers
## belong to each histological type.  Here I replace the #NULL! with NA so
## that I can filter it out into a new dataframe. I then convert it into a
## numeric so that it can be passed through the built in mean function.
ans <- Triple_neg %>%
    replace(. == "#NULL!", NA)
g <- ans[!is.na(ans$Size_Pathol_cm_), ]
Mean_Size_T <- as.numeric(g$Size_Pathol_cm_)
Mean_size_Trip_neg <- mean(Mean_Size_T)
## After finding the mean of the size of the triple negative tumor in cm
## after omitting all #NULL! values, I think the data set in the paper
## maybe replaced the Null values with 0 or used the n of 256 to calculate
## the mean because my mean is slightly higher
Range_size_trip_neg <- range(Mean_Size_T)
Range_size_trip_neg
## [1] 0.2 9.3
# Found the range of values for the tumor size. The paper says that the
# range of the values for tumor size are (0.2-5.4), whereas my
# calculations indicate that the range extends up to 9.3.
## Now I filtered the data for Palpability
Triple_neg$Palpability[Triple_neg$Palpability == "1"] <- "No"
Triple_neg$Palpability[Triple_neg$Palpability == "0"] <- "Yes"
## Changed the 1 and 0s found in Palpability to Yes and No values (kind of
## arbitrary, but I thought it was good practice for data coding), and now
## I counted the number of yes and no values.
No <- (length(which(Triple_neg$Palpability == "No")))
Yes <- (length(which(Triple_neg$Palpability == "Yes")))
Percent_palp <- percent(Yes)
Percent_not_palp <- percent(No)
## Calculated the number and percentage of palpable vs not palpable tumors
## in the Triple Negative subtype, using the function as defined above. I
## thought it was easiest to assign them into new variable in order to
## format them into the table properly.
## Now I will assign the numeric positional data to new values which will
## make it easier to filter out the data and do further analyses, again I
## thought variable assignment was the best for later calculations. I
## filtered within one line of code. I know this could be done through
## dplyr, but I wasn't sure how easy the variable reassignment and table
## creation would have been.
UOQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "1")))  #codes the data and counts for it within the df
percent_UOQ <- percent(UOQ)  ## percentage of counts from n as the previously described function
UIQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "2")))
percent_UIQ <- percent(UIQ)
LOQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "3")))
percent_LOQ <- percent(LOQ)
LIQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "4")))
percent_LIQ <- percent(LIQ)
periareolar <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "5")))
percent_periareolar <- percent(periareolar)
## I did similar assignments for mediolateral locations.
medial <- (length(which(Triple_neg$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "1")))
percent_medial <- percent(medial)
central <- (length(which(Triple_neg$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "2")))
percent_central <- percent(central)
lateral <- (length(which(Triple_neg$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "3")))
percent_lateral <- percent(lateral)
## Similar assignments for anteroposterior locations.
anterior <- (length(which(Triple_neg$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "1")))
percent_anterior <- percent(anterior)
middle <- (length(which(Triple_neg$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "2")))
percent_middle <- percent(middle)
posterior <- (length(which(Triple_neg$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "3")))
percent_posterior <- percent(posterior)
## Now I will filter the data for ER subtype and conduct essentially the
## same descriptive stats on it as I did for TN.
ER_pos <- filter(d, Subtype == "1")  ## Filter out the data into new df
n_E <- nrow(ER_pos)  ##double checking I coded correctly :) 
## Histological Grade filtering for the ER subtype, and wrote a new
## function for percent to account for the change in n (same issue as
## before where the n value is differing from variable to variable).
T_Hist_grade_1_E <- (length(which(ER_pos$Histologic_full == "1")))
T_hist_grade_2_E <- (length(which(ER_pos$Histologic_full == "2")))
T_hist_grade_3_E <- (length(which(ER_pos$Histologic_full == "3")))
percentE <- function(x) {
    perc <- x/846
    return(perc)
}
T_hist_grade1E_percent <- percentE(T_Hist_grade_1_E)
T_hist_grade2E_percent <- percentE(T_hist_grade_2_E)
T_hist_grade3E_percent <- percentE(T_hist_grade_3_E)
## In this, much like in the TN case, I replaced the NULL values with NA
## to remove them from the dataframe to make for easier wrangling.
ans <- ER_pos %>%
    replace(. == "#NULL!", NA)
Eg <- ans[!is.na(ans$Size_Pathol_cm_), ]
Mean_Size_E <- as.numeric(Eg$Size_Pathol_cm_)
Mean_size_ER <- mean(Mean_Size_E)  ## Mean of ER subtype
Range_size_ER <- range(Eg$Size_Pathol_cm_)  #Range of ER subtype
Range_size_ER
## [1] "0.20" "9.80"
## Again, the values I got for both the range and mean of the subtype
## differ from what is reported. Could be attributed to replacement with
## zeros or averaging with filtered data. Unsure why the range is
## different than reported.... The filtering was done correctly.
## Now ran the statistics for the palpability of ER data.
No_ER <- (length(which(ER_pos$Palpability == "1")))  ## Didn't change to yes or no in the df for ER because that seemed redundant
Yes_ER <- (length(which(ER_pos$Palpability == "0")))
Percent_palpE <- percentE(Yes_ER)
Percent_not_palpE <- percentE(No_ER)
## Now filter out the ER data based on each quadrant and assign variables
## to each to put into a table. Calculate the percentages as well based on
## the function that I wrote previously.
UOQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "1")))
percent_UOQ_E <- percentE(UOQ_E)
UIQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "2")))
percent_UIQ_E <- percent(UIQ_E)
LOQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "3")))
percent_LOQ_E <- percentE(LOQ_E)
LIQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "4")))
percent_LIQ_E <- percentE(LIQ_E)
periareolar_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "5")))
percent_periareolar_E <- percentE(periareolar_E)
## ER: Do the same for mediolateral locations. Assign to variables to be
## put in a table.
medial_E <- (length(which(ER_pos$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "1")))
percent_medial_E <- percentE(medial_E)
central_E <- (length(which(ER_pos$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "2")))
percent_central_E <- percentE(central_E)
lateral_E <- (length(which(ER_pos$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "3")))
percent_lateral_E <- percentE(lateral_E)
# ER: DO the same assignments for anteroposterior locations
anterior_E <- (length(which(ER_pos$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "1")))
percent_anterior_E <- percentE(anterior_E)
middle_E <- (length(which(ER_pos$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "2")))
percent_middle_E <- percentE(middle_E)
posterior_E <- (length(which(ER_pos$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "3")))
percent_posterior_E <- percentE(posterior_E)
# Make variables into a table that matches the one in the paper.
a <- c("Palpability_no", "Palpability_yes", "Upper Outer Quadrant", "Upper Inner Quadrant",
    "Lower Outer Quadrant", "Lower Inner Quadrant", "Periareolar", "Mediolateral_Medial",
    "Mediolateral_Central", "Mediolateral_Lateral", "Anteroposterior_Anterior",
    "Anteroposterior_Middle", "Anteroposterior_Posterior", "Histological Grade 1",
    "Histological Grade 2", "Histological Grade 3")
b <- c(Yes, No, UOQ, UIQ, LOQ, LIQ, periareolar, medial, central, lateral, anterior,
    middle, posterior, T_Hist_grade_1, T_hist_grade_2, T_hist_grade_3)
c <- 100 * c(Percent_palp, Percent_not_palp, percent_UOQ, percent_UIQ, percent_LOQ,
    percent_LIQ, percent_periareolar, percent_medial, percent_central, percent_lateral,
    percent_anterior, percent_middle, percent_posterior, T_hist_grade1_percent,
    T_hist_grade2_percent, T_hist_grade3_percent)
d <- c(Yes_ER, No_ER, UOQ_E, UIQ_E, LOQ_E, LIQ_E, periareolar_E, medial_E, central_E,
    lateral_E, anterior_E, middle_E, posterior_E, T_Hist_grade_1_E, T_hist_grade_2_E,
    T_hist_grade_3_E)
e <- 100 * c(Percent_palpE, Percent_not_palpE, percent_UOQ_E, percent_UIQ_E,
    percent_LOQ_E, percent_LIQ_E, percent_periareolar_E, percent_medial_E, percent_central_E,
    percent_lateral_E, percent_anterior_E, percent_middle_E, percent_posterior_E,
    T_hist_grade1E_percent, T_hist_grade2E_percent, T_hist_grade3E_percent)
bind_col <- data.frame(Variables = a, Triple_Negative_Counts = b, Triple_Negative_Percentages = c,
    ER_Counts = d, ER_Percentages = e)
kable(bind_col, digits = 3) %>%
    kable_styling(font_size = 12, full_width = FALSE)
Variables Triple_Negative_Counts Triple_Negative_Percentages ER_Counts ER_Percentages
Palpability_no 60 23.438 379 44.799
Palpability_yes 196 76.562 467 55.201
Upper Outer Quadrant 126 49.219 375 44.326
Upper Inner Quadrant 73 28.516 225 87.891
Lower Outer Quadrant 26 10.156 112 13.239
Lower Inner Quadrant 18 7.031 61 7.210
Periareolar 13 5.078 73 8.629
Mediolateral_Medial 51 19.922 186 21.986
Mediolateral_Central 100 39.062 310 36.643
Mediolateral_Lateral 105 41.016 350 41.371
Anteroposterior_Anterior 17 6.641 80 9.456
Anteroposterior_Middle 99 38.672 396 46.809
Anteroposterior_Posterior 140 54.688 370 43.735
Histological Grade 1 2 0.781 125 14.775
Histological Grade 2 35 13.672 478 56.501
Histological Grade 3 219 85.547 243 28.723
bind_col
##                    Variables Triple_Negative_Counts Triple_Negative_Percentages
## 1             Palpability_no                     60                   23.437500
## 2            Palpability_yes                    196                   76.562500
## 3       Upper Outer Quadrant                    126                   49.218750
## 4       Upper Inner Quadrant                     73                   28.515625
## 5       Lower Outer Quadrant                     26                   10.156250
## 6       Lower Inner Quadrant                     18                    7.031250
## 7                Periareolar                     13                    5.078125
## 8        Mediolateral_Medial                     51                   19.921875
## 9       Mediolateral_Central                    100                   39.062500
## 10      Mediolateral_Lateral                    105                   41.015625
## 11  Anteroposterior_Anterior                     17                    6.640625
## 12    Anteroposterior_Middle                     99                   38.671875
## 13 Anteroposterior_Posterior                    140                   54.687500
## 14      Histological Grade 1                      2                    0.781250
## 15      Histological Grade 2                     35                   13.671875
## 16      Histological Grade 3                    219                   85.546875
##    ER_Counts ER_Percentages
## 1        379      44.799054
## 2        467      55.200946
## 3        375      44.326241
## 4        225      87.890625
## 5        112      13.238771
## 6         61       7.210402
## 7         73       8.628842
## 8        186      21.985816
## 9        310      36.643026
## 10       350      41.371158
## 11        80       9.456265
## 12       396      46.808511
## 13       370      43.735225
## 14       125      14.775414
## 15       478      56.501182
## 16       243      28.723404
knitr::include_graphics("Kim_et_al_photos/count_table.jpg")

knitr::include_graphics("Kim_et_al_photos/location_table.jpg")

## Discussion of results: All of the counts and percentage data matched
## the data found in the paper. This means that even though some data
## points were deleted by omitting the NULL values in the raw dataset, the
## n values as reported were used to calculate the percentages. While this
## was a simple data analysis for descriptive stats, it tells us
## confidently that the variables used later on for regression analysis
## were correct.
# More Descriptive Stats
## Need to filter out the normalized distance from the chest wall and
## distance from the chest wall, need to replace the NULL with NA in order
## to have a clean dataset that I can use for future calculations. I
## decided to filter ALL of the positional data , so that I could use it
## for visualization in the next step.
Triple_neg_normY <- Triple_neg$Y_normalized %>%
    replace(. == "#NULL!", NA)
Triple_neg_normY <- na.omit(Triple_neg_normY)
m_Triple_neg_normY <- mean(Triple_neg_normY)  ## Filtering out the y-normalized distances from dataset.
Triple_neg_normX <- Triple_neg$X_normalized %>%
    replace(. == "#NULL!", NA)
Triple_neg_normX <- na.omit(Triple_neg_normX)
m_Triple_neg_normX <- mean(Triple_neg_normX)  ## Filtering out the x-normalized distances from dataset.
Triple_neg_normZ <- Triple_neg$Z_normalized %>%
    replace(. == "#NULL!", NA)
Triple_neg_normZ <- na.omit(Triple_neg_normZ)
m_Triple_neg_normZ <- mean(Triple_neg_normZ)  ## Filtering out the z-normalized distances from dataset.
Triple_neg_Y <- Triple_neg$Y_distance %>%
    replace(. == "#NULL!", NA)
Triple_neg_Y <- na.omit(Triple_neg_Y)
m_Triple_neg_Y <- mean(Triple_neg_Y)  ## Filtering out the y-absolute distances from dataset.
Triple_neg_X <- Triple_neg$X_distance %>%
    replace(. == "#NULL!", NA)
Triple_neg_X <- na.omit(Triple_neg_X)
m_Triple_neg_X <- mean(Triple_neg_X)  ## Filtering out the x-absolute distances from dataset.
Triple_neg_Z <- Triple_neg$Z_distance %>%
    replace(. == "#NULL!", NA)
Triple_neg_Z <- na.omit(Triple_neg_Z)
m_Triple_neg_Z <- mean(Triple_neg_Z)  ## Filtering out the z-absolute distances from dataset.
## Do the same positional filtering for ER data.
ER_normY <- ER_pos$Y_normalized %>%
    replace(. == "#NULL!", NA)
ER_normY <- na.omit(ER_normY)
m_ER_normY <- mean(ER_normY)
ER_normX <- ER_pos$X_normalized %>%
    replace(. == "#NULL!", NA)
ER_normX <- na.omit(ER_normX)
m_ER_normX <- mean(ER_normX)
ER_normZ <- ER_pos$Z_normalized %>%
    replace(. == "#NULL!", NA)
ER_normZ <- na.omit(ER_normZ)
m_ER_normZ <- mean(ER_normZ)
ER_Y <- ER_pos$Y_distance %>%
    replace(. == "#NULL!", NA)
ER_Y <- na.omit(ER_Y)
m_ER_Y <- mean(ER_Y)
ER_X <- ER_pos$X_distance %>%
    replace(. == "#NULL!", NA)
ER_X <- na.omit(ER_X)
m_ER_X <- mean(ER_X)
ER_Z <- ER_pos$Z_distance %>%
    replace(. == "#NULL!", NA)
ER_Z <- na.omit(ER_Z)
m_ER_Z <- mean(ER_Z)
## Begin doing inferential stats, assume normal distribution for standard
## error and confidence intervals. I created a function for both of them
## and will use it to define variables for inferential statistical values
## for the positional data in the future.
SE <- function(x, type = "normal") {
    if (type == "normal") {
        SE <- sd(x)/sqrt(length(x))
    }
    if (type == "poisson") {
        SE <- sqrt(mean(x)/length(x))
        # mean(x) is estimate of lambda
    }
    return(SE)
}
CI_norm <- function(x, alpha = 0.05) {
    CI <- mean(x) + c(-1, 1) * qnorm(1 - alpha/2) * SE(x)
    # confidence interval based on normal distribution
    names(CI) <- c("CI norm L", "CI norm U")
    return(CI)
}
## Now take the data and filter it by positional filters and pass through
## the functions needed to get the variables of interest for the summary
## table that will later be created.
ER_means <- ER_pos %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~mean(.)))  ## ER: finding mean of variables
ER_SD <- ER_pos %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~sd(.)))  ## ER:finding SD of variables
ER_SEs <- ER_pos %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~SE(., type = "normal")))  ## ER: Assume normal distribution, finding standard error of variables
ER_CI_norm <- ER_pos %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~CI_norm(.)))  ## ER:Finding 95% confidence intervals
TN_means <- Triple_neg %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~mean(.)))  ## TN: finding mean of variables
TN_SD <- Triple_neg %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~sd(.)))  ## TN:finding SD of variables
TN_SEs <- Triple_neg %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~SE(., type = "normal")))  ## TN: Assume normal distribution, finding standard error of variables
TN_CI_norm <- Triple_neg %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~CI_norm(.)))  ## TN:Finding 95% confidence intervals
## Now I will perform t-tests on on the variables/filtered data in order
## to extract the p-values needed for the tables.
X_norm <- t.test(ER_normX, Triple_neg_normX)
p_x_norm <- X_norm$p.value
Y_norm <- t.test(ER_normY, Triple_neg_normY)
p_y_norm <- Y_norm$p.value
Z_norm <- t.test(ER_normZ, Triple_neg_normZ)
p_Z_norm <- Z_norm$p.value
Z_dist <- t.test(ER_Z, Triple_neg_Z)
p_Z <- Z_dist$p.value
x_dist <- t.test(ER_X, Triple_neg_X)
p_x <- x_dist$p.value
y_dist <- t.test(ER_Y, Triple_neg_Y)
p_y <- y_dist$p.value
p <- c(p_y, p_x, p_Z, p_y_norm, p_x_norm, p_Z_norm)
## Create the second descriptive table found in the paper using the data
## that I filtered and calculated.
samp_1_summary <- as_tibble(t(bind_rows(ER_means, TN_means, ER_SD, TN_SD, ER_SEs,
    TN_SEs, ER_CI_norm, TN_CI_norm)), .name_repair = "minimal")
Samp_fin_sum <- bind_cols(samp_1_summary, p)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * ...
# p<-as_tibble(t(bind_rows(p_y, p_x, p_Z, p_y_norm, p_x_norm, p_Z_norm,
# .name_repair = 'minimal')))
names(Samp_fin_sum) <- c(" ER Mean", "TN Mean", "ER SD", "TN SD", "ER Std Error",
    "TN Std Error", "ER Confidence Interval L", "ER Confidence Interval U",
    "TN Confidence Interval L", "TN Confidence Interval U", "P Value")
variables_sum <- tibble(Variable = c("Y_distance", "X_distance", "Z_distance",
    "Y_normalized", "X_normalized", "Z_normalized"))
samp_summary <- bind_cols(variables_sum, Samp_fin_sum)
kable(samp_summary, digits = 3) %>%
    kable_styling(font_size = 12, full_width = FALSE)
Variable ER Mean TN Mean ER SD TN SD ER Std Error TN Std Error ER Confidence Interval L ER Confidence Interval U TN Confidence Interval L TN Confidence Interval U P Value
Y_distance 2.300 1.841 1.932 1.645 0.066 0.103 2.169 2.430 1.640 2.043 0.000
X_distance 0.430 0.498 1.516 1.491 0.052 0.093 0.328 0.533 0.316 0.681 0.524
Z_distance 0.912 1.179 2.021 2.140 0.069 0.134 0.776 1.048 0.917 1.441 0.077
Y_normalized 0.253 0.209 0.183 0.170 0.006 0.011 0.241 0.265 0.188 0.229 0.000
X_normalized 0.054 0.057 0.176 0.172 0.006 0.011 0.042 0.066 0.036 0.079 0.768
Z_normalized 0.059 0.075 0.130 0.139 0.004 0.009 0.050 0.067 0.058 0.092 0.099
samp_summary
## # A tibble: 6 × 12
##   Variable    ` ER Mean` `TN Mean` `ER SD` `TN SD` `ER Std Error` `TN Std Error`
##   <chr>            <dbl>     <dbl>   <dbl>   <dbl>          <dbl>          <dbl>
## 1 Y_distance      2.30      1.84     1.93    1.65         0.0664         0.103  
## 2 X_distance      0.430     0.498    1.52    1.49         0.0521         0.0932 
## 3 Z_distance      0.912     1.18     2.02    2.14         0.0695         0.134  
## 4 Y_normaliz…     0.253     0.209    0.183   0.170        0.00630        0.0106 
## 5 X_normaliz…     0.0538    0.0574   0.176   0.172        0.00605        0.0108 
## 6 Z_normaliz…     0.0585    0.0746   0.130   0.139        0.00446        0.00867
## # … with 5 more variables: ER Confidence Interval L <dbl>,
## #   ER Confidence Interval U <dbl>, TN Confidence Interval L <dbl>,
## #   TN Confidence Interval U <dbl>, P Value <dbl>
knitr::include_graphics("Kim_et_al_photos/distance_table.jpg")

## Discussion: Overall the values are close to what is seen in the paper,
## but overall they are a little bit higher. When I calculated ranges
## earlier on for exploratory descriptive analysis, it looked like some of
## the outliars for tumor size were excluded from further analysis because
## it included an upper bound that was much higher than reported in the
## literature. I am assuming this is what is slightly skewing some of the
## values (I only did ranges for the y-values, but it would be interesting
## to see how the others compared).
# Inferential Stats Do inferential stats for the paper, no distinction
# between TN and ER for calculations for the beta coefficients Reading in
# the data again because so many variables were defined before that it is
# just simpler for my brain to refilter based off of what is needed from
# the total dataset
f <- "https://raw.githubusercontent.com/mrpickett26/Individual-Data-Analysis-Replication/main/tumor%20location%20raw%20data_final.csv"
d <- read_csv(f, col_names = TRUE)
## Rows: 1102 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Size_Pathol_cm_
## dbl (13): Subtype, Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ans2 <- d %>%
    replace(. == "#NULL!", NA)
ans2 <- na.omit(ans2)
## Redefine variables from the filtered dataset and convert them to
## numeric so that I can do further calculations on them more easily
Size_pathol <- as.numeric(ans2$Size_Pathol_cm_)
Hist_grade <- as.numeric(ans2$Histologic_full)
Subtype <- as.numeric(ans2$Subtype)
## Define the absolute and normalized y distances as numeric variable for
## use in later calculations
normY_d <- ans2$Y_normalized  ## normalized
Y_d <- ans2$Y_distance  ##Absolute
## Hand calculate the beta coefficients of normalized y-axis and tumor
## size before doing any of the built in calculations.
beta1_size <- cor(normY_d, Size_pathol) * (sd(normY_d)/sd(Size_pathol))
beta1_size
## [1] -0.008155774
(beta0_size <- mean(normY_d) - beta1_size * mean(Size_pathol))
## [1] 0.2633268
beta0_size  #do hand calculations of beta0 for normalized y distance am size of tumor
## [1] 0.2633268
## Now do standard error of b1 and b0 for normalized y distance am size of
## tumor by hand
n <- nrow(ans2)
df <- n - 2
mean_x <- mean(Size_pathol)
y_pred = beta0_size + beta1_size * normY_d
y_error = normY_d - y_pred
std_err_b1_dist <- sqrt((sum(y_error^2))/((n - 2) * sum((Size_pathol - mean_x)^2)))
std_err_b1_dist
## [1] 0.004522476
## I will use the lm function to evaluate the rest of the variables BUT if
## I was to do it by hand, it would follow the formats about for
## beta1_size, beta1_size_norm, beta0_size, and beta0_size_norm
## Now utilize the lm function to double check calculations and calculate
## the beta coefficients of absolute/normalized against selected variables
## (subtype, size, histological grade). I chose subtype because it is
## foundational to the paper, size becuase the data is continuous, and
## histological grade because it was good for me to practice categorical
## data analysis and it was in a relatively clean format.
m_size_path_norm <- lm(normY_d ~ Size_pathol)
summary(m_size_path_norm)
## 
## Call:
## lm(formula = normY_d ~ Size_pathol)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26170 -0.14620 -0.04294  0.14483  0.64646 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.263327   0.012372   21.29   <2e-16 ***
## Size_pathol -0.008156   0.004457   -1.83   0.0675 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.181 on 1009 degrees of freedom
## Multiple R-squared:  0.003308,   Adjusted R-squared:  0.00232 
## F-statistic: 3.349 on 1 and 1009 DF,  p-value: 0.06754
std_err_pn <- coef(summary(m_size_path_norm))[, "Std. Error"]  ##Pull out standard error for table
m_Hist_grade_norm <- lm(normY_d ~ Hist_grade)
summary(m_Hist_grade_norm)  #This one seems to differ 
## 
## Call:
## lm(formula = normY_d ~ Hist_grade)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27989 -0.15185 -0.02381  0.12011  0.67619 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.307928   0.020476  15.039  < 2e-16 ***
## Hist_grade  -0.028039   0.008526  -3.289  0.00104 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1803 on 1009 degrees of freedom
## Multiple R-squared:  0.01061,    Adjusted R-squared:  0.009625 
## F-statistic: 10.82 on 1 and 1009 DF,  p-value: 0.001041
std_err_hgn <- coef(summary(m_Hist_grade_norm))[, "Std. Error"]
m_Subtype_norm <- lm(normY_d ~ Subtype)
summary(m_Subtype_norm)
## 
## Call:
## lm(formula = normY_d ~ Subtype)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25448 -0.15448 -0.05448  0.14552  0.69520 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.279312   0.011346  24.619  < 2e-16 ***
## Subtype     -0.024836   0.006766  -3.671 0.000254 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1801 on 1009 degrees of freedom
## Multiple R-squared:  0.01318,    Adjusted R-squared:  0.0122 
## F-statistic: 13.48 on 1 and 1009 DF,  p-value: 0.0002544
std_err_sn <- coef(summary(m_Subtype_norm))[, "Std. Error"]
m_size_path <- lm(Y_d ~ Size_pathol)
summary(m_size_path)
## 
## Call:
## lm(formula = Y_d ~ Size_pathol)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3013 -1.4678 -0.3680  0.9822  7.7988 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.36809    0.12893  18.368   <2e-16 ***
## Size_pathol -0.06675    0.04644  -1.437    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.886 on 1009 degrees of freedom
## Multiple R-squared:  0.002043,   Adjusted R-squared:  0.001054 
## F-statistic: 2.066 on 1 and 1009 DF,  p-value: 0.1509
std_err_sp <- coef(summary(m_size_path))[, "Std. Error"]
m_Hist_grade <- lm(Y_d ~ Hist_grade)
summary(m_Hist_grade)  #This one seems to differ 
## 
## Call:
## lm(formula = Y_d ~ Hist_grade)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4707 -1.4899 -0.4091  0.9909  7.9909 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.85152    0.21333  13.367  < 2e-16 ***
## Hist_grade  -0.28079    0.08883  -3.161  0.00162 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.879 on 1009 degrees of freedom
## Multiple R-squared:  0.009806,   Adjusted R-squared:  0.008825 
## F-statistic: 9.993 on 1 and 1009 DF,  p-value: 0.001619
std_err_hg <- coef(summary(m_Hist_grade))[, "Std. Error"]
m_Subtype <- lm(Y_d ~ Subtype)
summary(m_Subtype)
## 
## Call:
## lm(formula = Y_d ~ Subtype)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3162 -1.5162 -0.4162  0.9838  7.7838 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.5650     0.1182  21.697  < 2e-16 ***
## Subtype      -0.2487     0.0705  -3.528 0.000437 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.876 on 1009 degrees of freedom
## Multiple R-squared:  0.01219,    Adjusted R-squared:  0.01121 
## F-statistic: 12.45 on 1 and 1009 DF,  p-value: 0.000437
std_err_s <- coef(summary(m_Subtype))[, "Std. Error"]
#'In statistics, standardized (regression) coefficients, also called beta coefficients or beta weights, are the estimates resulting from a regression analysis where the underlying data have been standardized so that the variances of dependent and independent variables are equal to 1.' source: https://en.wikipedia.org/wiki/Standardized_coefficient, the lm.beta package allows us to calculate this easily, and will be done for the previously used variables. 
Std_beta_coeff_Y_norm_size <- lm.beta(m_size_path_norm)
summary(Std_beta_coeff_Y_norm_size)
## 
## Call:
## lm(formula = normY_d ~ Size_pathol)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26170 -0.14620 -0.04294  0.14483  0.64646 
## 
## Coefficients:
##              Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  0.263327     0.000000   0.012372   21.29   <2e-16 ***
## Size_pathol -0.008156    -0.057517   0.004457   -1.83   0.0675 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.181 on 1009 degrees of freedom
## Multiple R-squared:  0.003308,   Adjusted R-squared:  0.00232 
## F-statistic: 3.349 on 1 and 1009 DF,  p-value: 0.06754
coeff_size_norm <- m_size_path_norm$coefficients
std_coeff_size_norm <- Std_beta_coeff_Y_norm_size$standardized.coefficients  ## pull out the value from the lm table of standardized coefficicents
Std_beta_coeff_Y_size <- lm.beta(m_size_path)
summary(Std_beta_coeff_Y_size)
## 
## Call:
## lm(formula = Y_d ~ Size_pathol)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3013 -1.4678 -0.3680  0.9822  7.7988 
## 
## Coefficients:
##             Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  2.36809      0.00000    0.12893  18.368   <2e-16 ***
## Size_pathol -0.06675     -0.04520    0.04644  -1.437    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.886 on 1009 degrees of freedom
## Multiple R-squared:  0.002043,   Adjusted R-squared:  0.001054 
## F-statistic: 2.066 on 1 and 1009 DF,  p-value: 0.1509
coeff_size <- m_size_path$coefficients
std_coeff_size <- Std_beta_coeff_Y_size$standardized.coefficients
std_beta_hist_norm <- lm.beta(m_Hist_grade_norm)
summary(std_beta_hist_norm)
## 
## Call:
## lm(formula = normY_d ~ Hist_grade)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27989 -0.15185 -0.02381  0.12011  0.67619 
## 
## Coefficients:
##              Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  0.307928     0.000000   0.020476  15.039  < 2e-16 ***
## Hist_grade  -0.028039    -0.102983   0.008526  -3.289  0.00104 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1803 on 1009 degrees of freedom
## Multiple R-squared:  0.01061,    Adjusted R-squared:  0.009625 
## F-statistic: 10.82 on 1 and 1009 DF,  p-value: 0.001041
coeff_hist_norm <- m_Hist_grade_norm$coefficients
std_coeff_hist_norm <- std_beta_hist_norm$standardized.coefficients
std_beta_hist <- lm.beta(m_Hist_grade)
summary(m_Hist_grade)
## 
## Call:
## lm(formula = Y_d ~ Hist_grade)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4707 -1.4899 -0.4091  0.9909  7.9909 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.85152    0.21333  13.367  < 2e-16 ***
## Hist_grade  -0.28079    0.08883  -3.161  0.00162 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.879 on 1009 degrees of freedom
## Multiple R-squared:  0.009806,   Adjusted R-squared:  0.008825 
## F-statistic: 9.993 on 1 and 1009 DF,  p-value: 0.001619
coeff_hist <- m_Hist_grade$coefficients
std_coeff_hist <- std_beta_hist$standardized.coefficients
std_beta_sub_norm <- lm.beta(m_Subtype_norm)
summary(std_beta_sub_norm)
## 
## Call:
## lm(formula = normY_d ~ Subtype)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25448 -0.15448 -0.05448  0.14552  0.69520 
## 
## Coefficients:
##              Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  0.279312     0.000000   0.011346  24.619  < 2e-16 ***
## Subtype     -0.024836    -0.114801   0.006766  -3.671 0.000254 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1801 on 1009 degrees of freedom
## Multiple R-squared:  0.01318,    Adjusted R-squared:  0.0122 
## F-statistic: 13.48 on 1 and 1009 DF,  p-value: 0.0002544
coeff_norm_sub <- m_Subtype_norm$coefficients
std_coeff_norm_sub <- std_beta_sub_norm$standardized.coefficients
std_beta_sub <- lm.beta(m_Subtype)
summary(std_beta_sub)
## 
## Call:
## lm(formula = Y_d ~ Subtype)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3162 -1.5162 -0.4162  0.9838  7.7838 
## 
## Coefficients:
##             Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)   2.5650       0.0000     0.1182  21.697  < 2e-16 ***
## Subtype      -0.2487      -0.1104     0.0705  -3.528 0.000437 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.876 on 1009 degrees of freedom
## Multiple R-squared:  0.01219,    Adjusted R-squared:  0.01121 
## F-statistic: 12.45 on 1 and 1009 DF,  p-value: 0.000437
coeff_sub <- m_Subtype$coefficients
std_coeff_sub <- std_beta_sub$standardized.coefficients
## Now creating the table that is seen in the paper for inferential stats
## (beta, standard err, standardized beta). Pulling all of the 'outputs'
## out of the summary table and creating a tibble to bind together into
## the larger table.
samp_2_summary <- as_tibble(t(bind_rows(coeff_hist, std_err_hg, std_coeff_hist)),
    .name_repair = "minimal")
samp_2_summary1 <- as_tibble(t(bind_rows(coeff_hist_norm, std_err_hgn, std_coeff_hist_norm)),
    .name_repair = "minimal")
samp_3_summary <- as_tibble(t(bind_rows(coeff_size, std_err_sp, std_coeff_size)),
    .name_repair = "minimal")
samp_3_summary1 <- as_tibble(t(bind_rows(coeff_size_norm, std_err_pn, std_coeff_size_norm)),
    .name_repair = "minimal")
samp_4_summary <- as_tibble(t(bind_rows(coeff_sub, std_err_s, std_coeff_sub)),
    .name_repair = "minimal")
samp_4_summary1 <- as_tibble(t(bind_rows(coeff_norm_sub, std_err_sn, std_coeff_norm_sub)),
    .name_repair = "minimal")
samp_2_summary <- samp_2_summary[-c(1), ]
samp_3_summary <- samp_3_summary[-c(1), ]
samp_4_summary <- samp_4_summary[-c(1), ]
samp_2_summary1 <- samp_2_summary1[-c(1), ]
samp_3_summary1 <- samp_3_summary1[-c(1), ]
samp_4_summary1 <- samp_4_summary1[-c(1), ]
Y_dist_data <- bind_rows(samp_2_summary, samp_3_summary, samp_4_summary, samp_2_summary1,
    samp_3_summary1, samp_4_summary1)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
names(Y_dist_data) <- c("Beta Coefficient", "Standard Error", "Standardized Beta Coefficient")
variables <- tibble(Variable = c("Absolute y-axis Histological Grade", "Absolute y-axis Tumor size(cm)",
    "Absolute y-axis Triple Negative", "Normalized y-axis Histological Grade",
    "Normalized y-axis Tumor Size", " Normalized y-axis Triple Negative"))
Y_dist_data <- bind_cols(variables, Y_dist_data)
kable(Y_dist_data, digits = 3) %>%
    kable_styling(font_size = 12, full_width = FALSE)
Variable Beta Coefficient Standard Error Standardized Beta Coefficient
Absolute y-axis Histological Grade -0.281 0.089 -0.099
Absolute y-axis Tumor size(cm) -0.067 0.046 -0.045
Absolute y-axis Triple Negative -0.249 0.070 -0.110
Normalized y-axis Histological Grade -0.028 0.009 -0.103
Normalized y-axis Tumor Size -0.008 0.004 -0.058
Normalized y-axis Triple Negative -0.025 0.007 -0.115
Y_dist_data
## # A tibble: 6 × 4
##   Variable              `Beta Coefficie… `Standard Error` `Standardized Beta Co…
##   <chr>                            <dbl>            <dbl>                  <dbl>
## 1 "Absolute y-axis His…         -0.281            0.0888                 -0.0990
## 2 "Absolute y-axis Tum…         -0.0668           0.0464                 -0.0452
## 3 "Absolute y-axis Tri…         -0.249            0.0705                 -0.110 
## 4 "Normalized y-axis H…         -0.0280           0.00853                -0.103 
## 5 "Normalized y-axis T…         -0.00816          0.00446                -0.0575
## 6 " Normalized y-axis …         -0.0248           0.00677                -0.115
knitr::include_graphics("Kim_et_al_photos/inferential_table.jpg")

# Discussion: Much like the previous section, the beta coefficients that I
# ran are lower than what is reported in the literature. Again, I think
# this could be due to the fact that the data they ran their analyses on
# likely included outliar testing (because the ranges of the data differ
# from my findings to the literature). The regression coefficients are
# relatively close for Tumor Size and Subtype, however. For Histological
# grade it is significantly different. I am afraid that this is due to the
# fact it is categorical data and I did not account for that. The
# standardized beta coefficients are also lower (except for the normalized
# Y tumor size), but they are closer in comparison overall (probably due
# to standardization).
# DATA VISUALIZATION
## Do data visualization now. No real visualization in the paper, so I
## chose a 3D plot of the positional data of the different cancer subtypes
## for both normalized and absolute. This allows me to visualize the
## distribution of where each tumor is located and how the normalization
## shifts the positions of the data.
Norm_ER__plot <- scatter3D(ER_normX, ER_normY, ER_normZ, clab = c("ER Normalized",
    "Position (cm)"))

Abs_ER__plot <- scatter3D(ER_X, ER_Y, ER_Z, clab = c(" ER Absolute", "Position (cm)"))

Norm_TN__plot <- scatter3D(Triple_neg_normX, Triple_neg_normY, Triple_neg_normZ,
    clab = c("TN Normalized", "Position (cm)"))

Abs_TN__plot <- scatter3D(Triple_neg_X, Triple_neg_Y, Triple_neg_Z, clab = c("TN Absolute",
    "Position (cm)"))

## When the positional data is normalized, it looks very 'normal' (go
## figure!), there is no skewness to it, which makes sense since it all is
## normalized within the spread of the data, so the center should be at
## point zero.
## Because the data is categorical, I decided that it didn't make much
## sense to plot all of them as scatter plots, but that a dot and whisker
## plot might be more intuitive. The dots represent the regression
## coefficients of the two separate models and the whiskers represent the
## 99% confidence intervals (as specified in the code)
dwplot(list(m_Hist_grade, m_Hist_grade_norm, m_Subtype, m_Subtype_norm, m_size_path,
    m_size_path_norm), ci = 0.99, show_intercept = FALSE) + scale_y_discrete(labels = c(Subtype = "Subtype",
    Hist_grade = "Histological Grade", Size_pathol = "Tumor Size (cm)")) + scale_color_hue(labels = c("Normalized y-axis position",
    "Absolute y-axis position", "Normalized y-axis position", "Absolute y-axis position",
    "Normalized y-axis position", "Absolute y-axis position")) + theme(plot.title = element_text(face = "bold")) +
    ggtitle("Regression Coefficients for Breast Tumor Y-axis positions") + xlab("Standardized Beta Coefficient")

# Normalized y-axis and Tumor Size
plot(m_size_path_norm)

# Absolute y-axis and Tumor Size
plot(m_size_path)

## Discussion: In the dot and whisker plot we can see that the 99%
## confidence intervals for the absolute y-valus are much broader than
## those of the normalized points. In terms of the residuals vs fitted
## plot for both y values, the data does not appear to have any linearity-
## I am not sure what type of model I would even fit it to because it is
## so left skewed. I believe that this further supports my hypothesis that
## there was a significant amount of data cleaning for outliars in the
## post processing that would allow for a linear model. For the QQ plots,
## the absolute y-axis QQplot looks to be very nicely normally
## distributed, whereas the normalized y-axis data looks to be multimodal
## on the qq plot.